From: Keir Fraser Date: Tue, 30 Jun 2009 14:40:39 +0000 (+0100) Subject: x86 hvm mce: Support HVM Guest virtual MCA handling. X-Git-Tag: archive/raspbian/4.8.0-1+rpi1~1^2~13670 X-Git-Url: https://dgit.raspbian.org/%22http://www.example.com/cgi/%22/%22http:/www.example.com/cgi/%22?a=commitdiff_plain;h=db49191f3e52f9283133f32f28bb234ce3069b34;p=xen.git x86 hvm mce: Support HVM Guest virtual MCA handling. When an MCE# occurs and Xen has contained/recovered the error, but the error impacts a guest domain (DOM0, HVM guest, or PV guest), Xen injects the corresponding vMCE# into the impacted domain. The guest OS then carries out its own recovery if it has an MCA handler. Signed-off-by: Liping Ke Signed-off-by: Yunhong Jiang --- diff --git a/xen/arch/x86/cpu/mcheck/mce_intel.c b/xen/arch/x86/cpu/mcheck/mce_intel.c index c87b74caf9..77226f1938 100644 --- a/xen/arch/x86/cpu/mcheck/mce_intel.c +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "mce.h" #include "x86_mca.h" @@ -224,7 +225,7 @@ static struct bank_entry* alloc_bank_entry(void) { for vMCE# MSRs virtualization */ -static int fill_vmsr_data(int cpu, struct mcinfo_bank *mc_bank, +static int fill_vmsr_data(struct mcinfo_bank *mc_bank, uint64_t gstatus) { struct domain *d; struct bank_entry *entry; @@ -240,28 +241,89 @@ static int fill_vmsr_data(int cpu, struct mcinfo_bank *mc_bank, return 0; } + /* For HVM guest, Only when first vMCE is consumed by HVM guest successfully, + * will we generete another node and inject another vMCE + */ + if ( (d->is_hvm) && (d->arch.vmca_msrs.nr_injection > 0) ) + { + printk(KERN_DEBUG "MCE: HVM guest has not handled previous" + " vMCE yet!\n"); + return -1; + } entry = alloc_bank_entry(); if (entry == NULL) - return -1; + return -1; + entry->mci_status = mc_bank->mc_status; entry->mci_addr = mc_bank->mc_addr; entry->mci_misc = mc_bank->mc_misc; - entry->cpu = cpu; entry->bank = mc_bank->mc_bank; - spin_lock(&d->arch.vmca_msrs.lock); + 
spin_lock(&d->arch.vmca_msrs.lock); /* New error Node, insert to the tail of the per_dom data */ list_add_tail(&entry->list, &d->arch.vmca_msrs.impact_header); /* Fill MSR global status */ d->arch.vmca_msrs.mcg_status = gstatus; /* New node impact the domain, need another vMCE# injection*/ d->arch.vmca_msrs.nr_injection++; - spin_unlock(&d->arch.vmca_msrs.lock); + spin_unlock(&d->arch.vmca_msrs.lock); - printk(KERN_DEBUG "MCE: Found error @[CPU%d BANK%d " + printk(KERN_DEBUG "MCE: Found error @[BANK%d " "status %"PRIx64" addr %"PRIx64" domid %d]\n ", - entry->cpu, mc_bank->mc_bank, - mc_bank->mc_status, mc_bank->mc_addr, mc_bank->mc_domid); + mc_bank->mc_bank, mc_bank->mc_status, mc_bank->mc_addr, + mc_bank->mc_domid); + } + return 0; +} + +static int inject_mce(struct domain *d) +{ + int cpu = smp_processor_id(); + cpumask_t affinity; + + /* PV guest and HVM guest have different vMCE# injection + * methods*/ + + if ( !test_and_set_bool(d->vcpu[0]->mce_pending) ) + { + if (d->is_hvm) + { + printk(KERN_DEBUG "MCE: inject vMCE to HVM DOM %d\n", + d->domain_id); + vcpu_kick(d->vcpu[0]); + } + /* PV guest including DOM0 */ + else + { + printk(KERN_DEBUG "MCE: inject vMCE to PV DOM%d\n", + d->domain_id); + if (guest_has_trap_callback + (d, 0, TRAP_machine_check)) + { + d->vcpu[0]->cpu_affinity_tmp = + d->vcpu[0]->cpu_affinity; + cpus_clear(affinity); + cpu_set(cpu, affinity); + printk(KERN_DEBUG "MCE: CPU%d set affinity, old %d\n", cpu, + d->vcpu[0]->processor); + vcpu_set_affinity(d->vcpu[0], &affinity); + vcpu_kick(d->vcpu[0]); + } + else + { + printk(KERN_DEBUG "MCE: Kill PV guest with No MCE handler\n"); + domain_crash(d); + } + } + } + else { + /* new vMCE comes while first one has not been injected yet, + * in this case, inject fail. [We can't lose this vMCE for + * the mce node's consistency]. 
+ */ + printk(KERN_DEBUG "There's a pending vMCE waiting to be injected " + " to this DOM%d!\n", d->domain_id); + return -1; } return 0; } @@ -272,7 +334,7 @@ void intel_UCR_handler(struct mcinfo_bank *bank, struct mca_handle_result *result) { struct domain *d; - unsigned long mfn; + unsigned long mfn, gfn; uint32_t status; printk(KERN_DEBUG "MCE: Enter EWB UCR recovery action\n"); @@ -280,6 +342,7 @@ void intel_UCR_handler(struct mcinfo_bank *bank, if (bank->mc_addr != 0) { mfn = bank->mc_addr >> PAGE_SHIFT; if (!offline_page(mfn, 1, &status)) { + /* This is free page */ if (status & PG_OFFLINE_OFFLINED) result->result = MCA_RECOVERED; else if (status & PG_OFFLINE_PENDING) { @@ -289,9 +352,35 @@ void intel_UCR_handler(struct mcinfo_bank *bank, result->owner = status >> PG_OFFLINE_OWNER_SHIFT; printk(KERN_DEBUG "MCE: This error page is ownded" " by DOM %d\n", result->owner); - if (result->owner != 0 && result->owner != DOMID_XEN) { + /* Fill vMCE# injection and vMCE# MSR virtualization " + * "related data */ + bank->mc_domid = result->owner; + if ( result->owner != DOMID_XEN ) { d = get_domain_by_id(result->owner); - domain_crash(d); + gfn = + mfn_to_gmfn(d, ((bank->mc_addr) >> PAGE_SHIFT)); + bank->mc_addr = + gfn << PAGE_SHIFT | (bank->mc_addr & PAGE_MASK); + if (fill_vmsr_data(bank, global->mc_gstatus) == -1) + { + printk(KERN_DEBUG "Fill vMCE# data for DOM%d " + "failed\n", result->owner); + domain_crash(d); + return; + } + /* We will inject vMCE to DOMU*/ + if ( inject_mce(d) < 0 ) + { + printk(KERN_DEBUG "inject vMCE to DOM%d" + " failed\n", d->domain_id); + domain_crash(d); + return; + } + /* Impacted domain go on with domain's recovery job + * if the domain has its own MCA handler. + * For xen, it has contained the error and finished + * its own recovery job. 
+ */ result->result = MCA_RECOVERED; } } @@ -309,7 +398,7 @@ struct mca_error_handler intel_recovery_handler[INTEL_MAX_RECOVERY] = * should be committed for dom0 consumption, 0 if it should be * dismissed. */ -static int mce_action(unsigned int cpu, mctelem_cookie_t mctc) +static int mce_action(mctelem_cookie_t mctc) { struct mc_info *local_mi; uint32_t i; @@ -335,9 +424,6 @@ static int mce_action(unsigned int cpu, mctelem_cookie_t mctc) continue; } mc_bank = (struct mcinfo_bank*)mic; - /* Fill vMCE# injection and vMCE# MSR virtualization related data */ - if (fill_vmsr_data(cpu, mc_bank, mc_global->mc_gstatus) == -1) - break; /* TODO: Add recovery actions here, such as page-offline, etc */ memset(&mca_res, 0x0f, sizeof(mca_res)); @@ -386,7 +472,6 @@ static void mce_softirq(void) { int cpu = smp_processor_id(); unsigned int workcpu; - cpumask_t affinity; printk(KERN_DEBUG "CPU%d enter softirq\n", cpu); @@ -417,7 +502,7 @@ static void mce_softirq(void) * vMCE MSRs virtualization buffer */ for_each_online_cpu(workcpu) { - mctelem_process_deferred(workcpu, mce_action); + mctelem_process_deferred(workcpu, mce_action); } /* Step2: Send Log to DOM0 through vIRQ */ @@ -425,20 +510,6 @@ static void mce_softirq(void) printk(KERN_DEBUG "MCE: send MCE# to DOM0 through virq\n"); send_guest_global_virq(dom0, VIRQ_MCA); } - - /* Step3: Inject vMCE to impacted DOM. 
Currently we cares DOM0 only */ - if (guest_has_trap_callback - (dom0, 0, TRAP_machine_check) && - !test_and_set_bool(dom0->vcpu[0]->mce_pending)) { - dom0->vcpu[0]->cpu_affinity_tmp = - dom0->vcpu[0]->cpu_affinity; - cpus_clear(affinity); - cpu_set(cpu, affinity); - printk(KERN_DEBUG "MCE: CPU%d set affinity, old %d\n", cpu, - dom0->vcpu[0]->processor); - vcpu_set_affinity(dom0->vcpu[0], &affinity); - vcpu_kick(dom0->vcpu[0]); - } } mce_barrier_exit(&mce_inside_bar); @@ -1057,7 +1128,27 @@ int intel_mce_wrmsr(u32 msr, u64 value) break; case MSR_IA32_MCG_STATUS: d->arch.vmca_msrs.mcg_status = value; - gdprintk(XENLOG_DEBUG, "MCE: wrmsr MCG_CTL %"PRIx64"\n", value); + gdprintk(XENLOG_DEBUG, "MCE: wrmsr MCG_STATUS %"PRIx64"\n", value); + /* For HVM guest, this is the point for deleting vMCE injection node */ + if ( (d->is_hvm) && (d->arch.vmca_msrs.nr_injection >0) ) + { + d->arch.vmca_msrs.nr_injection--; /* Should be 0 */ + if (!list_empty(&d->arch.vmca_msrs.impact_header)) { + entry = list_entry(d->arch.vmca_msrs.impact_header.next, + struct bank_entry, list); + if (entry->mci_status & MCi_STATUS_VAL) + gdprintk(XENLOG_ERR, "MCE: MCi_STATUS MSR should have " + "been cleared before write MCG_STATUS MSR\n"); + + gdprintk(XENLOG_DEBUG, "MCE: Delete HVM last injection " + "Node, nr_injection %u\n", + d->arch.vmca_msrs.nr_injection); + list_del(&entry->list); + } + else + gdprintk(XENLOG_DEBUG, "MCE: Not found HVM guest" + " last injection Node, something Wrong!\n"); + } break; case MSR_IA32_MCG_CAP: gdprintk(XENLOG_WARNING, "MCE: MCG_CAP is read-only\n"); diff --git a/xen/arch/x86/cpu/mcheck/mctelem.c b/xen/arch/x86/cpu/mcheck/mctelem.c index d8dd482bc6..28b220f6e9 100644 --- a/xen/arch/x86/cpu/mcheck/mctelem.c +++ b/xen/arch/x86/cpu/mcheck/mctelem.c @@ -153,7 +153,7 @@ void mctelem_defer(mctelem_cookie_t cookie) } void mctelem_process_deferred(unsigned int cpu, - int (*fn)(unsigned int, mctelem_cookie_t)) + int (*fn)(mctelem_cookie_t)) { struct mctelem_ent *tep; 
struct mctelem_ent *head, *prev; @@ -189,7 +189,7 @@ void mctelem_process_deferred(unsigned int cpu, prev = tep->mcte_prev; tep->mcte_next = tep->mcte_prev = NULL; - ret = fn(cpu, MCTE2COOKIE(tep)); + ret = fn(MCTE2COOKIE(tep)); if (prev != NULL) prev->mcte_next = NULL; tep->mcte_prev = tep->mcte_next = NULL; diff --git a/xen/arch/x86/cpu/mcheck/mctelem.h b/xen/arch/x86/cpu/mcheck/mctelem.h index fb381a10a0..04edf98656 100644 --- a/xen/arch/x86/cpu/mcheck/mctelem.h +++ b/xen/arch/x86/cpu/mcheck/mctelem.h @@ -69,7 +69,7 @@ extern void mctelem_consume_oldest_end(mctelem_cookie_t); extern void mctelem_ack(mctelem_class_t, mctelem_cookie_t); extern void mctelem_defer(mctelem_cookie_t); extern void mctelem_process_deferred(unsigned int, - int (*)(unsigned int, mctelem_cookie_t)); + int (*)(mctelem_cookie_t)); int mctelem_has_deferred(unsigned int); #endif diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c index a7f8cc1c54..1060e75a7a 100644 --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -1771,6 +1771,8 @@ void hvm_rdtsc_intercept(struct cpu_user_regs *regs) regs->edx = (uint32_t)(tsc >> 32); } +extern int intel_mce_rdmsr(u32 msr, u32 *lo, u32 *hi); +extern int intel_mce_wrmsr(u32 msr, u64 value); int hvm_msr_read_intercept(struct cpu_user_regs *regs) { uint32_t ecx = regs->ecx; @@ -1779,6 +1781,8 @@ int hvm_msr_read_intercept(struct cpu_user_regs *regs) uint64_t *var_range_base, *fixed_range_base; int index, mtrr; uint32_t cpuid[4]; + uint32_t lo, hi; + int ret; var_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.var_ranges; fixed_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.fixed_ranges; @@ -1796,18 +1800,6 @@ int hvm_msr_read_intercept(struct cpu_user_regs *regs) msr_content = vcpu_vlapic(v)->hw.apic_base_msr; break; - case MSR_IA32_MCG_CAP: - case MSR_IA32_MCG_STATUS: - case MSR_IA32_MC0_STATUS: - case MSR_IA32_MC1_STATUS: - case MSR_IA32_MC2_STATUS: - case MSR_IA32_MC3_STATUS: - case MSR_IA32_MC4_STATUS: - case MSR_IA32_MC5_STATUS: - /* No 
point in letting the guest see real MCEs */ - msr_content = 0; - break; - case MSR_IA32_CR_PAT: msr_content = v->arch.hvm_vcpu.pat_cr; break; @@ -1858,7 +1850,17 @@ int hvm_msr_read_intercept(struct cpu_user_regs *regs) break; default: - return hvm_funcs.msr_read_intercept(regs); + ret = intel_mce_rdmsr(ecx, &lo, &hi); + if ( ret < 0 ) + goto gp_fault; + else if ( ret ) + { + msr_content = ((u64)hi << 32) | lo; + break; + } + /* ret == 0, This is not an MCE MSR, see other MSRs */ + else if (!ret) + return hvm_funcs.msr_read_intercept(regs); } regs->eax = (uint32_t)msr_content; @@ -1884,6 +1886,7 @@ int hvm_msr_write_intercept(struct cpu_user_regs *regs) struct vcpu *v = current; int index, mtrr; uint32_t cpuid[4]; + int ret; hvm_cpuid(1, &cpuid[0], &cpuid[1], &cpuid[2], &cpuid[3]); mtrr = !!(cpuid[3] & bitmaskof(X86_FEATURE_MTRR)); @@ -1946,7 +1949,13 @@ int hvm_msr_write_intercept(struct cpu_user_regs *regs) break; default: - return hvm_funcs.msr_write_intercept(regs); + ret = intel_mce_wrmsr(ecx, msr_content); + if ( ret < 0 ) + goto gp_fault; + else if ( ret ) + break; + else if (!ret) + return hvm_funcs.msr_write_intercept(regs); } return X86EMUL_OKAY; diff --git a/xen/arch/x86/hvm/irq.c b/xen/arch/x86/hvm/irq.c index bd2f145921..90e1f45174 100644 --- a/xen/arch/x86/hvm/irq.c +++ b/xen/arch/x86/hvm/irq.c @@ -326,6 +326,9 @@ struct hvm_intack hvm_vcpu_has_pending_irq(struct vcpu *v) if ( unlikely(v->nmi_pending) ) return hvm_intack_nmi; + if ( unlikely(v->mce_pending) ) + return hvm_intack_mce; + if ( vlapic_accept_pic_intr(v) && plat->vpic[0].int_output ) return hvm_intack_pic(0); @@ -347,6 +350,10 @@ struct hvm_intack hvm_vcpu_ack_pending_irq( if ( !test_and_clear_bool(v->nmi_pending) ) intack = hvm_intack_none; break; + case hvm_intsrc_mce: + if ( !test_and_clear_bool(v->mce_pending) ) + intack = hvm_intack_none; + break; case hvm_intsrc_pic: ASSERT(v->vcpu_id == 0); if ( (vector = vpic_ack_pending_irq(v)) == -1 ) diff --git a/xen/arch/x86/hvm/vmx/intr.c 
b/xen/arch/x86/hvm/vmx/intr.c index afd9d41919..ef536ac8f2 100644 --- a/xen/arch/x86/hvm/vmx/intr.c +++ b/xen/arch/x86/hvm/vmx/intr.c @@ -157,6 +157,10 @@ asmlinkage void vmx_intr_assist(void) { vmx_inject_nmi(); } + else if ( intack.source == hvm_intsrc_mce ) + { + vmx_inject_hw_exception(TRAP_machine_check, HVM_DELIVER_NO_ERROR_CODE); + } else { HVMTRACE_2D(INJ_VIRQ, intack.vector, /*fake=*/ 0); diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c index 70fef66f84..5773a1b22f 100644 --- a/xen/arch/x86/x86_64/traps.c +++ b/xen/arch/x86/x86_64/traps.c @@ -309,12 +309,13 @@ unsigned long do_iret(void) && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity)) vcpu_set_affinity(v, &v->cpu_affinity_tmp); - /*Currently, only inject vMCE to DOM0.*/ + /* inject vMCE to PV_Guest including DOM0. */ if (v->trap_priority >= VCPU_TRAP_NMI) { - printk(KERN_DEBUG "MCE: Return from vMCE# trap!"); - if (d->domain_id == 0 && v->vcpu_id == 0) { + printk(KERN_DEBUG "MCE: Return from vMCE# trap!\n"); + if ( v->vcpu_id == 0 ) { if ( !d->arch.vmca_msrs.nr_injection ) { - printk(KERN_WARNING "MCE: Ret from vMCE#, nr_injection is 0\n"); + printk(KERN_WARNING "MCE: Ret from vMCE#, " + "No injection Node\n"); goto end; } diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h index 93ddceecf7..92c9a9d375 100644 --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -210,7 +210,6 @@ struct p2m_domain; * put into impact_header list. 
*/ struct bank_entry { struct list_head list; - int32_t cpu; uint16_t bank; uint64_t mci_status; uint64_t mci_addr; diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h index d85a0f61c5..4557cd0dcd 100644 --- a/xen/include/asm-x86/hvm/hvm.h +++ b/xen/include/asm-x86/hvm/hvm.h @@ -31,7 +31,8 @@ enum hvm_intsrc { hvm_intsrc_none, hvm_intsrc_pic, hvm_intsrc_lapic, - hvm_intsrc_nmi + hvm_intsrc_nmi, + hvm_intsrc_mce }; struct hvm_intack { uint8_t source; /* enum hvm_intsrc */ @@ -41,6 +42,7 @@ struct hvm_intack { #define hvm_intack_pic(vec) ( (struct hvm_intack) { hvm_intsrc_pic, vec } ) #define hvm_intack_lapic(vec) ( (struct hvm_intack) { hvm_intsrc_lapic, vec } ) #define hvm_intack_nmi ( (struct hvm_intack) { hvm_intsrc_nmi, 2 } ) +#define hvm_intack_mce ( (struct hvm_intack) { hvm_intsrc_mce, 18 } ) enum hvm_intblk { hvm_intblk_none, /* not blocked (deliverable) */ hvm_intblk_shadow, /* MOV-SS or STI shadow */